tutorials/017 - Partition Projection.ipynb (900 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)\n", "\n", "# 17 - Partition Projection\n", "\n", "https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "import getpass\n", "from datetime import datetime\n", "\n", "import pandas as pd\n", "\n", "import awswrangler as wr" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Enter your bucket name:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ···········································\n" ] } ], "source": [ "bucket = getpass.getpass()" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Integer projection" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>value</th>\n", " <th>year</th>\n", " <th>month</th>\n", " <th>day</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>2019</td>\n", " <td>10</td>\n", " <td>25</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>2020</td>\n", " <td>11</td>\n", " <td>26</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>2021</td>\n", " <td>12</td>\n", " <td>27</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " value year month day\n", "0 1 2019 10 25\n", "1 2 2020 11 26\n", "2 3 2021 12 27" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\"value\": [1, 2, 3], \"year\": [2019, 2020, 2021], \"month\": [10, 11, 12], \"day\": [25, 26, 27]})\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.s3.to_parquet(\n", " df=df,\n", " path=f\"s3://{bucket}/table_integer/\",\n", " dataset=True,\n", " partition_cols=[\"year\", \"month\", \"day\"],\n", " database=\"default\",\n", " table=\"table_integer\",\n", " athena_partition_projection_settings={\n", " \"projection_types\": {\"year\": \"integer\", \"month\": \"integer\", \"day\": \"integer\"},\n", " \"projection_ranges\": {\"year\": \"2000,2025\", \"month\": \"1,12\", \"day\": \"1,31\"},\n", " },\n", ")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>value</th>\n", " <th>year</th>\n", " <th>month</th>\n", " <th>day</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>3</td>\n", " <td>2021</td>\n", " <td>12</td>\n", " <td>27</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>2020</td>\n", " <td>11</td>\n", " <td>26</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1</td>\n", " <td>2019</td>\n", " <td>10</td>\n", " <td>25</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " value year month day\n", "0 3 2021 12 27\n", "1 2 2020 11 26\n", "2 1 2019 10 25" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.athena.read_sql_query(\"SELECT * FROM table_integer\", database=\"default\")" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Enum projection" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>value</th>\n", " <th>city</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>São Paulo</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>Tokio</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>Seattle</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " value city\n", "0 1 São Paulo\n", "1 2 Tokio\n", "2 3 Seattle" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " {\n", " \"value\": [1, 2, 3],\n", " \"city\": [\"São Paulo\", \"Tokio\", \"Seattle\"],\n", " }\n", ")\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.s3.to_parquet(\n", " df=df,\n", " path=f\"s3://{bucket}/table_enum/\",\n", " dataset=True,\n", " partition_cols=[\"city\"],\n", " database=\"default\",\n", " table=\"table_enum\",\n", " athena_partition_projection_settings={\n", " \"projection_types\": {\n", " \"city\": \"enum\",\n", " },\n", " \"projection_values\": {\"city\": \"São Paulo,Tokio,Seattle\"},\n", " },\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>value</th>\n", " <th>city</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>São Paulo</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>3</td>\n", " <td>Seattle</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2</td>\n", " <td>Tokio</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " value city\n", "0 1 São Paulo\n", "1 3 Seattle\n", "2 2 Tokio" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.athena.read_sql_query(\"SELECT * FROM table_enum\", database=\"default\")" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Date projection" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>value</th>\n", " <th>dt</th>\n", " <th>ts</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>2020-01-01</td>\n", " <td>2020-01-01 00:00:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>2020-01-02</td>\n", " <td>2020-01-01 00:00:01</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>2020-01-03</td>\n", " <td>2020-01-01 00:00:02</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " value dt ts\n", "0 1 2020-01-01 2020-01-01 00:00:00\n", "1 2 2020-01-02 2020-01-01 00:00:01\n", "2 3 2020-01-03 2020-01-01 00:00:02" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def ts(x):\n", " return datetime.strptime(x, \"%Y-%m-%d %H:%M:%S\")\n", "\n", "\n", "def dt(x):\n", " return datetime.strptime(x, \"%Y-%m-%d\").date()\n", "\n", "\n", "df = pd.DataFrame(\n", " {\n", " \"value\": [1, 2, 3],\n", " \"dt\": [dt(\"2020-01-01\"), dt(\"2020-01-02\"), dt(\"2020-01-03\")],\n", " \"ts\": [ts(\"2020-01-01 00:00:00\"), ts(\"2020-01-01 00:00:01\"), ts(\"2020-01-01 00:00:02\")],\n", " }\n", ")\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.s3.to_parquet(\n", " df=df,\n", " path=f\"s3://{bucket}/table_date/\",\n", " dataset=True,\n", " partition_cols=[\"dt\", \"ts\"],\n", " database=\"default\",\n", " table=\"table_date\",\n", " athena_partition_projection_settings={\n", " \"projection_types\": {\n", " \"dt\": \"date\",\n", " \"ts\": \"date\",\n", " },\n", " \"projection_ranges\": {\"dt\": \"2020-01-01,2020-01-03\", \"ts\": \"2020-01-01 00:00:00,2020-01-01 00:00:02\"},\n", " },\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>value</th>\n", " <th>dt</th>\n", " <th>ts</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>2020-01-01</td>\n", " <td>2020-01-01 00:00:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>2020-01-02</td>\n", " <td>2020-01-01 00:00:01</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>2020-01-03</td>\n", " <td>2020-01-01 00:00:02</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " value dt ts\n", "0 1 2020-01-01 2020-01-01 00:00:00\n", "1 2 2020-01-02 2020-01-01 00:00:01\n", "2 3 2020-01-03 2020-01-01 00:00:02" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.athena.read_sql_query(\"SELECT * FROM table_date\", database=\"default\")" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Injected projection" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>value</th>\n", " <th>uuid</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>761e2488-a078-11ea-bb37-0242ac130002</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>b89ed095-8179-4635-9537-88592c0f6bc3</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>87adc586-ce88-4f0a-b1c8-bf8e00d32249</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " value uuid\n", "0 1 761e2488-a078-11ea-bb37-0242ac130002\n", "1 2 b89ed095-8179-4635-9537-88592c0f6bc3\n", "2 3 87adc586-ce88-4f0a-b1c8-bf8e00d32249" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " {\n", " \"value\": [1, 2, 3],\n", " \"uuid\": [\n", " \"761e2488-a078-11ea-bb37-0242ac130002\",\n", " \"b89ed095-8179-4635-9537-88592c0f6bc3\",\n", " \"87adc586-ce88-4f0a-b1c8-bf8e00d32249\",\n", " ],\n", " }\n", ")\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.s3.to_parquet(\n", " df=df,\n", " path=f\"s3://{bucket}/table_injected/\",\n", " dataset=True,\n", " partition_cols=[\"uuid\"],\n", " database=\"default\",\n", " table=\"table_injected\",\n", " athena_partition_projection_settings={\n", " \"projection_types\": {\n", " \"uuid\": \"injected\",\n", " }\n", " },\n", ")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>value</th>\n", " <th>uuid</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2</td>\n", " <td>b89ed095-8179-4635-9537-88592c0f6bc3</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " value uuid\n", "0 2 b89ed095-8179-4635-9537-88592c0f6bc3" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.athena.read_sql_query(\n", " sql=\"SELECT * FROM table_injected WHERE uuid='b89ed095-8179-4635-9537-88592c0f6bc3'\", database=\"default\"\n", ")" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Cleaning Up" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.s3.delete_objects(f\"s3://{bucket}/table_integer/\")\n", "wr.s3.delete_objects(f\"s3://{bucket}/table_enum/\")\n", "wr.s3.delete_objects(f\"s3://{bucket}/table_date/\")\n", "wr.s3.delete_objects(f\"s3://{bucket}/table_injected/\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.catalog.delete_table_if_exists(table=\"table_integer\", database=\"default\")\n", "wr.catalog.delete_table_if_exists(table=\"table_enum\", database=\"default\")\n", "wr.catalog.delete_table_if_exists(table=\"table_date\", database=\"default\")\n", "wr.catalog.delete_table_if_exists(table=\"table_injected\", database=\"default\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.14", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.14" } }, "nbformat": 4, "nbformat_minor": 4 }